##1 Load data ---------------------
library(readr)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(gbm)
## Loaded gbm 2.1.8.1
library(neuralnet)
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
library(nnet)
library(e1071)
library(class)
library(reshape2)
library(RColorBrewer)
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(gains)
# Read the sleep/lifestyle dataset (374 rows x 13 columns; readr's column
# spec is echoed below). The five character columns are encoded later in
# section 3 before modeling.
sleep_data <- read_csv("Sleep_health_and_lifestyle_dataset.csv")
## Rows: 374 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Gender, Occupation, BMI.Category, Blood.Pressure, Sleep.Disorder
## dbl (8): Person.ID, Age, Sleep.Duration, Quality.of.Sleep, Physical.Activity...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##2 explore data ---------------------
# basic analysis: column names and a first look at the raw rows
names(sleep_data)
## [1] "Person.ID" "Gender"
## [3] "Age" "Occupation"
## [5] "Sleep.Duration" "Quality.of.Sleep"
## [7] "Physical.Activity.Level" "Stress.Level"
## [9] "BMI.Category" "Blood.Pressure"
## [11] "Heart.Rate" "Daily.Steps"
## [13] "Sleep.Disorder"
head(sleep_data)
## # A tibble: 6 × 13
## Person.ID Gender Age Occupation Sleep.Duration Quality.of.Sleep
## <dbl> <chr> <dbl> <chr> <dbl> <dbl>
## 1 1 Male 27 Software Engineer 6.1 6
## 2 2 Male 28 Doctor 6.2 6
## 3 3 Male 28 Doctor 6.2 6
## 4 4 Male 28 Sales Representative 5.9 4
## 5 5 Male 28 Sales Representative 5.9 4
## 6 6 Male 28 Software Engineer 5.9 4
## # ℹ 7 more variables: Physical.Activity.Level <dbl>, Stress.Level <dbl>,
## # BMI.Category <chr>, Blood.Pressure <chr>, Heart.Rate <dbl>,
## # Daily.Steps <dbl>, Sleep.Disorder <chr>
# Structure: 374 observations, 8 numeric and 5 character columns (the
# character ones — Gender, Occupation, BMI.Category, Blood.Pressure,
# Sleep.Disorder — are transformed in section 3).
str(sleep_data)
## spc_tbl_ [374 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Person.ID : num [1:374] 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : chr [1:374] "Male" "Male" "Male" "Male" ...
## $ Age : num [1:374] 27 28 28 28 28 28 29 29 29 29 ...
## $ Occupation : chr [1:374] "Software Engineer" "Doctor" "Doctor" "Sales Representative" ...
## $ Sleep.Duration : num [1:374] 6.1 6.2 6.2 5.9 5.9 5.9 6.3 7.8 7.8 7.8 ...
## $ Quality.of.Sleep : num [1:374] 6 6 6 4 4 4 6 7 7 7 ...
## $ Physical.Activity.Level: num [1:374] 42 60 60 30 30 30 40 75 75 75 ...
## $ Stress.Level : num [1:374] 6 8 8 8 8 8 7 6 6 6 ...
## $ BMI.Category : chr [1:374] "Overweight" "Normal" "Normal" "Obese" ...
## $ Blood.Pressure : chr [1:374] "126/83" "125/80" "125/80" "140/90" ...
## $ Heart.Rate : num [1:374] 77 75 75 85 85 85 82 70 70 70 ...
## $ Daily.Steps : num [1:374] 4200 10000 10000 3000 3000 3000 3500 8000 8000 8000 ...
## $ Sleep.Disorder : chr [1:374] "None" "None" "None" "Sleep Apnea" ...
## - attr(*, "spec")=
## .. cols(
## .. Person.ID = col_double(),
## .. Gender = col_character(),
## .. Age = col_double(),
## .. Occupation = col_character(),
## .. Sleep.Duration = col_double(),
## .. Quality.of.Sleep = col_double(),
## .. Physical.Activity.Level = col_double(),
## .. Stress.Level = col_double(),
## .. BMI.Category = col_character(),
## .. Blood.Pressure = col_character(),
## .. Heart.Rate = col_double(),
## .. Daily.Steps = col_double(),
## .. Sleep.Disorder = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Five-number summaries of the numeric columns; Person.ID is just a row
# index (1..374) and is dropped during pre-processing.
summary(sleep_data)
## Person.ID Gender Age Occupation
## Min. : 1.00 Length:374 Min. :27.00 Length:374
## 1st Qu.: 94.25 Class :character 1st Qu.:35.25 Class :character
## Median :187.50 Mode :character Median :43.00 Mode :character
## Mean :187.50 Mean :42.18
## 3rd Qu.:280.75 3rd Qu.:50.00
## Max. :374.00 Max. :59.00
## Sleep.Duration Quality.of.Sleep Physical.Activity.Level Stress.Level
## Min. :5.800 Min. :4.000 Min. :30.00 Min. :3.000
## 1st Qu.:6.400 1st Qu.:6.000 1st Qu.:45.00 1st Qu.:4.000
## Median :7.200 Median :7.000 Median :60.00 Median :5.000
## Mean :7.132 Mean :7.313 Mean :59.17 Mean :5.385
## 3rd Qu.:7.800 3rd Qu.:8.000 3rd Qu.:75.00 3rd Qu.:7.000
## Max. :8.500 Max. :9.000 Max. :90.00 Max. :8.000
## BMI.Category Blood.Pressure Heart.Rate Daily.Steps
## Length:374 Length:374 Min. :65.00 Min. : 3000
## Class :character Class :character 1st Qu.:68.00 1st Qu.: 5600
## Mode :character Mode :character Median :70.00 Median : 7000
## Mean :70.17 Mean : 6817
## 3rd Qu.:72.00 3rd Qu.: 8000
## Max. :86.00 Max. :10000
## Sleep.Disorder
## Length:374
## Class :character
## Mode :character
##
##
##
#2.1 Explore the prediction target: Sleep.Disorder (three raw classes).
unique_values <- unique(sleep_data$Sleep.Disorder)
cat(paste("The outputs from the classification are:", toString(unique_values)))
## The outputs from the classification are: None, Sleep Apnea, Insomnia
table(sleep_data$Sleep.Disorder)
##
## Insomnia None Sleep Apnea
## 77 219 78
# Plot class counts. geom_bar() counts a discrete x directly; the original
# geom_histogram(stat = "count") triggered an "Ignoring unknown parameters:
# binwidth, bins, pad" warning because histograms are for continuous data.
ggplot(sleep_data, aes(x=Sleep.Disorder, fill=Sleep.Disorder)) +
geom_bar(position=position_dodge(width=10)) +
scale_fill_manual(values=c('#EBDEF0', '#4A235A', '#C39BD3')) +
labs(title='Distribution of persons have sleep disorder or not') +
theme_minimal() +
theme(plot.background = element_rect(fill='white'),
panel.background = element_rect(fill='white'),
plot.title = element_text(size=12, hjust = 0,face="bold"),
axis.title.x = element_text(size=10),
axis.title.y = element_text(size=10),
axis.text.x = element_text(size=10,face="bold"),
axis.text.y = element_text(size=10,face="bold"),
legend.title = element_text(size=10,face="bold"))

#2.2 Gender vs sleep disorder.
unique_gender <- unique(sleep_data$Gender)
cat(paste("The values of Sex column are:", toString(unique_gender)))
## The values of Sex column are: Male, Female
# Count by (Sleep.Disorder, Gender), then convert counts to within-disorder
# percentages. (The original computed an identical summarise first and
# immediately overwrote it; that dead duplicate has been removed.)
gender_disorder_counts <- sleep_data %>%
group_by(Sleep.Disorder, Gender) %>%
summarise(Count = n(), .groups = 'keep') %>%
group_by(Sleep.Disorder) %>%
mutate(Total = sum(Count)) %>%
ungroup() %>%
mutate(Percentage = Count / Total * 100)
# Plot: one pie chart per disorder class, slices = gender share.
ggplot(gender_disorder_counts, aes(x="", y=Percentage, fill=Gender)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) +
facet_wrap(~ Sleep.Disorder) +
scale_fill_manual(values=c('#C39BD3', '#EBDEF0')) +
labs(title="The relationship between Sex and Sleep Disorder") +
theme_void() +
theme(legend.position="bottom") # Adjust legend position if desired

##3 data pre-process ---------------------
##3.1 check missing-values (none in the whole frame)
sum(is.na(sleep_data))
## [1] 0
#drop person.ID (pure row index, no predictive value)
sleep_data <- sleep_data[, !names(sleep_data) %in% 'Person.ID']
#drop Heart.Rate because everyone's heart rate is within normal range
sleep_data <- sleep_data[, !names(sleep_data) %in% 'Heart.Rate']
#1:"sleep apnea" or "insomnia," and 0: no sleep problems
# Collapse the 3-class target to binary: any disorder -> 1, "None" -> 0.
sleep_data$Sleep.Disorder <- as.numeric(sleep_data$Sleep.Disorder %in% c("Sleep Apnea", "Insomnia"))
##3.2 transfer Blood.Pressure ("systolic/diastolic" strings, 25 distinct values)
unique(sleep_data$Blood.Pressure)
## [1] "126/83" "125/80" "140/90" "120/80" "132/87" "130/86" "117/76" "118/76"
## [9] "128/85" "131/86" "128/84" "115/75" "135/88" "129/84" "130/85" "115/78"
## [17] "119/77" "121/79" "125/82" "135/90" "122/80" "142/92" "140/95" "139/91"
## [25] "118/75"
#categorize blood pressure
# Map a "systolic/diastolic" string (e.g. "126/83") to an AHA-style category.
# Branches run from most extreme to least extreme so none is shadowed: in the
# original, 'Hypertensive Crisis' was tested AFTER 'Hypertension Stage 2'
# (sys >= 140 || dias >= 90), which made the crisis branch unreachable.
# The dataset's observed range (max 142/95) is unaffected by the reorder.
categorize_blood_pressure <- function(bp) {
  bp_parts <- strsplit(bp, split = "/")
  sys <- as.numeric(bp_parts[[1]][1])
  dias <- as.numeric(bp_parts[[1]][2])
  if (sys > 180 || dias > 120) {
    result <- 'Hypertensive Crisis'
  } else if (sys < 90 || dias < 60) {
    result <- 'Low'
  } else if (sys < 120 && dias < 80) {
    result <- 'Normal'
  } else if (sys >= 120 && sys < 130 && dias < 80) {
    result <- 'Elevated'
  } else if ((sys >= 130 && sys < 140) || (dias >= 80 && dias < 90)) {
    result <- 'Hypertension Stage 1'
  } else if (sys >= 140 || dias >= 90) {
    result <- 'Hypertension Stage 2'
  } else {
    # Unreachable for valid numeric input; kept as a defensive fallback.
    result <- 'Unknown'
  }
  return(result)
}
# Apply the categorizer to every "sys/dias" string (one label per row).
sleep_data$Blood.Pressure.Category <- sapply(sleep_data$Blood.Pressure, categorize_blood_pressure)
print(unique(sleep_data$Blood.Pressure.Category))
## [1] "Hypertension Stage 1" "Hypertension Stage 2" "Normal"
## [4] "Elevated"
#3.3 Transfer Age into a categorical variable
# categorize age into four life-stage bins; the data's age range is 27-59,
# so only the three adult bins actually occur
sleep_data$Age_Group <- cut(
x = sleep_data$Age,
breaks = c(0, 16, 30, 45, 100),
labels = c('Child', 'Young Adults', 'Middle-aged Adults', 'Old Adults'),
include.lowest = TRUE # Include the lowest value in the first bin
)
head(sleep_data$Age_Group)
## [1] Young Adults Young Adults Young Adults Young Adults Young Adults
## [6] Young Adults
## Levels: Child Young Adults Middle-aged Adults Old Adults
#3.4 BMI.Category data-cleaning: merge the "Normal Weight" spelling into "Normal"
sleep_data$BMI.Category[sleep_data$BMI.Category == "Normal Weight"] <- "Normal"
unique(sleep_data$BMI.Category)
## [1] "Overweight" "Normal" "Obese"
#3.5 Discretize the remaining continuous measures into equal-width bins.
sleep_data$Daily.Steps <- cut(sleep_data$Daily.Steps, 4)
sleep_data$Sleep.Duration <- cut(sleep_data$Sleep.Duration, 3)
sleep_data$Physical.Activity.Level <- cut(sleep_data$Physical.Activity.Level, 4)
#3.6 Label-encode every categorical column as integers 1..k
# (factor level order, i.e. alphabetical / bin order).
categories <- c('Gender','Occupation','Sleep.Duration',
'Physical.Activity.Level','BMI.Category',
'Daily.Steps','Age_Group',
'Blood.Pressure.Category')
sleep_data[categories] <- lapply(sleep_data[categories],
                                 function(col) as.numeric(as.factor(col)))
#3.7 Drop the raw Age and Blood.Pressure columns (their categorized versions
# Age_Group / Blood.Pressure.Category remain), leaving an all-numeric frame.
# Selecting by name replaces the original positional `sleep_data[, -c(2, 9)]`,
# which silently breaks if columns are added or reordered upstream; this also
# matches the name-based drops used in section 3.1.
sleep_data1 <- sleep_data[, !names(sleep_data) %in% c('Age', 'Blood.Pressure')]
#3.8 Correlation heatmap of the encoded features.
cor_matrix <- cor(sleep_data1)
color_palette <- colorRampPalette(c("pink",'white', "#4A235A"))(200)
corrplot(cor_matrix, type = "upper", order = "hclust",
tl.col = "black", tl.srt = 45, col = color_palette,
tl.cex = 0.9)
## Warning in ind1:ind2: numerical expression has 2 elements: only the first used
##4 split data ---------------------
set.seed(20231124)
# 60/40 train/test split by row index.
train_index <- sample(seq_len(nrow(sleep_data1)), size = floor(0.60 * nrow(sleep_data1)))
train_data <- sleep_data1[train_index, ]
# Feature matrices exclude the target. setdiff() on names avoids the
# `-which(...)` footgun: if "Sleep.Disorder" were ever absent, -integer(0)
# would silently select ZERO columns instead of all of them.
feature_cols <- setdiff(names(train_data), "Sleep.Disorder")
x_train <- as.matrix(train_data[, feature_cols])
y_train <- train_data$Sleep.Disorder
test_data <- sleep_data1[-train_index, ]
x_test <- as.matrix(test_data[, feature_cols])
y_test <- as.numeric(test_data$Sleep.Disorder)
##5 Logistic Regression Model ---------------------
#Train: binomial GLM on all encoded features
LR_model <- glm(Sleep.Disorder ~ ., data=train_data, family="binomial")
#Test: predicted probabilities on the held-out set, thresholded at 0.5
LR_predictions <- predict(LR_model, newdata = as.data.frame(x_test), type = "response")
LR_class_predictions <- ifelse(LR_predictions > 0.5, 1, 0)
#Evaluate (caret's default positive class here is "0")
LR_confusion <- confusionMatrix(as.factor(LR_class_predictions), as.factor(as.numeric(y_test)))
LR_confusion
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 77 6
## 1 11 56
##
## Accuracy : 0.8867
## 95% CI : (0.8248, 0.9326)
## No Information Rate : 0.5867
## P-Value [Acc > NIR] : 6.187e-16
##
## Kappa : 0.7691
##
## Mcnemar's Test P-Value : 0.332
##
## Sensitivity : 0.8750
## Specificity : 0.9032
## Pos Pred Value : 0.9277
## Neg Pred Value : 0.8358
## Prevalence : 0.5867
## Detection Rate : 0.5133
## Detection Prevalence : 0.5533
## Balanced Accuracy : 0.8891
##
## 'Positive' Class : 0
##
#the lift chart: decile gains table for the logistic model's probabilities
LR_gain <- gains(test_data$Sleep.Disorder, LR_predictions, groups=10)
summary(LR_gain)
## Length Class Mode
## depth 10 -none- numeric
## obs 10 -none- numeric
## cume.obs 10 -none- numeric
## mean.resp 10 -none- numeric
## cume.mean.resp 10 -none- numeric
## cume.pct.of.total 10 -none- numeric
## lift 10 -none- numeric
## cume.lift 10 -none- numeric
## mean.prediction 10 -none- numeric
## min.prediction 10 -none- numeric
## max.prediction 10 -none- numeric
## conf 1 -none- character
## optimal 1 -none- logical
## num.groups 1 -none- numeric
## percents 1 -none- logical
plot(LR_gain)

# Truth / probability / 0.5-threshold class columns for inspection.
# (A duplicate `LR_preds <- as.numeric(LR_predictions > 0.5)` that was
# immediately recomputed has been removed.)
LR_truth <- test_data$Sleep.Disorder
LR_probs <- LR_predictions
LR_preds <- as.numeric(LR_probs > 0.5)
LR_nick <- cbind(LR_truth, LR_probs, LR_preds)
# plot lift chart: cumulative positives captured vs cases targeted,
# with the random-targeting baseline dashed
plot(c(0,LR_gain$cume.pct.of.total*sum(test_data$Sleep.Disorder))~c(0,LR_gain$cume.obs),
xlab="# cases", ylab="Cumulative", main="", type="l")
lines(c(0,sum(test_data$Sleep.Disorder))~c(0, dim(test_data)[1]), lty=2)

##6 XGBoost Model ---------------------
#Train: default booster, 100 rounds, logistic objective (training log below)
xgb_model <- xgboost(data = x_train, label = as.numeric(y_train), nrounds = 100, objective = "binary:logistic")
## [1] train-logloss:0.512365
## [2] train-logloss:0.408083
## [3] train-logloss:0.343427
## [4] train-logloss:0.300811
## [5] train-logloss:0.272429
## [6] train-logloss:0.252566
## [7] train-logloss:0.239665
## [8] train-logloss:0.230768
## [9] train-logloss:0.222319
## [10] train-logloss:0.215232
## [11] train-logloss:0.210969
## [12] train-logloss:0.204953
## [13] train-logloss:0.200605
## [14] train-logloss:0.197245
## [15] train-logloss:0.194177
## [16] train-logloss:0.192157
## [17] train-logloss:0.190653
## [18] train-logloss:0.189419
## [19] train-logloss:0.187819
## [20] train-logloss:0.186768
## [21] train-logloss:0.185812
## [22] train-logloss:0.184657
## [23] train-logloss:0.183728
## [24] train-logloss:0.182829
## [25] train-logloss:0.181914
## [26] train-logloss:0.181002
## [27] train-logloss:0.180396
## [28] train-logloss:0.179434
## [29] train-logloss:0.178251
## [30] train-logloss:0.177576
## [31] train-logloss:0.176737
## [32] train-logloss:0.176133
## [33] train-logloss:0.175445
## [34] train-logloss:0.174910
## [35] train-logloss:0.174515
## [36] train-logloss:0.174158
## [37] train-logloss:0.173857
## [38] train-logloss:0.173424
## [39] train-logloss:0.173178
## [40] train-logloss:0.172921
## [41] train-logloss:0.172570
## [42] train-logloss:0.172283
## [43] train-logloss:0.171893
## [44] train-logloss:0.171619
## [45] train-logloss:0.171323
## [46] train-logloss:0.171076
## [47] train-logloss:0.170834
## [48] train-logloss:0.170636
## [49] train-logloss:0.170443
## [50] train-logloss:0.170218
## [51] train-logloss:0.169823
## [52] train-logloss:0.169540
## [53] train-logloss:0.169351
## [54] train-logloss:0.169177
## [55] train-logloss:0.168929
## [56] train-logloss:0.168791
## [57] train-logloss:0.168630
## [58] train-logloss:0.168495
## [59] train-logloss:0.168368
## [60] train-logloss:0.168243
## [61] train-logloss:0.168029
## [62] train-logloss:0.167913
## [63] train-logloss:0.167674
## [64] train-logloss:0.167564
## [65] train-logloss:0.167419
## [66] train-logloss:0.167249
## [67] train-logloss:0.167128
## [68] train-logloss:0.167030
## [69] train-logloss:0.166862
## [70] train-logloss:0.166694
## [71] train-logloss:0.166592
## [72] train-logloss:0.166479
## [73] train-logloss:0.166351
## [74] train-logloss:0.166267
## [75] train-logloss:0.166134
## [76] train-logloss:0.166056
## [77] train-logloss:0.165942
## [78] train-logloss:0.165839
## [79] train-logloss:0.165769
## [80] train-logloss:0.165703
## [81] train-logloss:0.165600
## [82] train-logloss:0.165462
## [83] train-logloss:0.165389
## [84] train-logloss:0.165301
## [85] train-logloss:0.165187
## [86] train-logloss:0.165119
## [87] train-logloss:0.165054
## [88] train-logloss:0.164931
## [89] train-logloss:0.164867
## [90] train-logloss:0.164763
## [91] train-logloss:0.164669
## [92] train-logloss:0.164605
## [93] train-logloss:0.164525
## [94] train-logloss:0.164460
## [95] train-logloss:0.164395
## [96] train-logloss:0.164337
## [97] train-logloss:0.164271
## [98] train-logloss:0.164189
## [99] train-logloss:0.164130
## [100] train-logloss:0.164084
#Test: predicted probabilities from the untuned model, thresholded at 0.5
xgb_predictions <- predict(xgb_model, newdata = as.matrix(x_test))
xgb_class_predictions <- ifelse(xgb_predictions > 0.5, 1, 0)
# visualize result: feature importance of the untuned model
importance_matrix <- xgb.importance(model = xgb_model)
xgb.plot.importance(importance_matrix)

# adjust params: grid search over (eta, max_depth) with 5-fold CV AUC
params <- list(
booster = "gbtree",
objective = "binary:logistic",
eta = 0.3,
max_depth = 6
)
eta_grid <- seq(0.01, 0.3, by = 0.05)
depth_grid <- 3:8
# Preallocate one slot per grid point and bind once at the end, instead of
# growing `results` with rbind() inside the loop (quadratic copying, and the
# original seed data.frame was missing the `nrounds` column added later).
results_list <- vector("list", length(eta_grid) * length(depth_grid))
slot <- 0
for (eta_val in eta_grid) {
for (depth in depth_grid) {
params$eta <- eta_val
params$max_depth <- depth
cv_model <- xgb.cv(
params = params,
data = x_train,
label = as.numeric(y_train),
nrounds = 100,
nfold = 5,
metrics = "auc",
early_stopping_rounds = 10,
verbose = FALSE
)
# CV AUC at the early-stopping-selected iteration
best_iteration <- cv_model$best_iteration
best_test_auc_mean <- cv_model$evaluation_log$test_auc_mean[best_iteration]
slot <- slot + 1
results_list[[slot]] <- data.frame(eta = eta_val,
max_depth = depth,
test_auc_mean = best_test_auc_mean,
nrounds = best_iteration)
}
}
results <- do.call(rbind, results_list)
#plot result: one line per eta value across tree depths
ggplot(results, aes(x = max_depth, y = test_auc_mean, group = eta, color = as.factor(eta))) +
geom_line() +
geom_point() +
labs(title = "Performance of Different max_depth and eta Combinations",
x = "Max Depth",
y = "Test AUC Mean",
color = "Eta Value") +
theme_minimal()

# Best grid values read off the CV plot above.
best_params <- params
best_params$eta <- 0.06
best_params$max_depth <- 5
# retrain best xgboost model
# NOTE(review): xgboost_model is only used for the tree plots at the end of
# this section; the confusion matrix below is computed from the untuned
# xgb_model's predictions — confirm whether that is intended.
xgboost_model <- xgboost(
params = best_params,
data = x_train,
label = as.numeric(y_train),
nrounds = 100
)
## [1] train-logloss:0.652276
## [2] train-logloss:0.615800
## [3] train-logloss:0.583111
## [4] train-logloss:0.553685
## [5] train-logloss:0.527122
## [6] train-logloss:0.502362
## [7] train-logloss:0.479853
## [8] train-logloss:0.459333
## [9] train-logloss:0.440599
## [10] train-logloss:0.423455
## [11] train-logloss:0.407753
## [12] train-logloss:0.393341
## [13] train-logloss:0.379897
## [14] train-logloss:0.367717
## [15] train-logloss:0.356502
## [16] train-logloss:0.345976
## [17] train-logloss:0.336272
## [18] train-logloss:0.327322
## [19] train-logloss:0.319185
## [20] train-logloss:0.311454
## [21] train-logloss:0.304369
## [22] train-logloss:0.297899
## [23] train-logloss:0.291704
## [24] train-logloss:0.285937
## [25] train-logloss:0.280780
## [26] train-logloss:0.275775
## [27] train-logloss:0.271150
## [28] train-logloss:0.266889
## [29] train-logloss:0.262882
## [30] train-logloss:0.259146
## [31] train-logloss:0.255738
## [32] train-logloss:0.252496
## [33] train-logloss:0.249614
## [34] train-logloss:0.246908
## [35] train-logloss:0.244340
## [36] train-logloss:0.241954
## [37] train-logloss:0.239694
## [38] train-logloss:0.237592
## [39] train-logloss:0.235637
## [40] train-logloss:0.233745
## [41] train-logloss:0.231985
## [42] train-logloss:0.230135
## [43] train-logloss:0.228403
## [44] train-logloss:0.226756
## [45] train-logloss:0.225117
## [46] train-logloss:0.223642
## [47] train-logloss:0.222259
## [48] train-logloss:0.221016
## [49] train-logloss:0.219774
## [50] train-logloss:0.218607
## [51] train-logloss:0.217509
## [52] train-logloss:0.216475
## [53] train-logloss:0.215536
## [54] train-logloss:0.214651
## [55] train-logloss:0.213622
## [56] train-logloss:0.212296
## [57] train-logloss:0.211058
## [58] train-logloss:0.209902
## [59] train-logloss:0.208754
## [60] train-logloss:0.207676
## [61] train-logloss:0.206724
## [62] train-logloss:0.205769
## [63] train-logloss:0.204891
## [64] train-logloss:0.204088
## [65] train-logloss:0.203382
## [66] train-logloss:0.202656
## [67] train-logloss:0.201942
## [68] train-logloss:0.201267
## [69] train-logloss:0.200680
## [70] train-logloss:0.199982
## [71] train-logloss:0.199309
## [72] train-logloss:0.198704
## [73] train-logloss:0.198096
## [74] train-logloss:0.197508
## [75] train-logloss:0.196976
## [76] train-logloss:0.196479
## [77] train-logloss:0.196071
## [78] train-logloss:0.195596
## [79] train-logloss:0.195149
## [80] train-logloss:0.194728
## [81] train-logloss:0.194368
## [82] train-logloss:0.194031
## [83] train-logloss:0.193715
## [84] train-logloss:0.193316
## [85] train-logloss:0.192971
## [86] train-logloss:0.192726
## [87] train-logloss:0.192349
## [88] train-logloss:0.191983
## [89] train-logloss:0.191570
## [90] train-logloss:0.191291
## [91] train-logloss:0.190956
## [92] train-logloss:0.190691
## [93] train-logloss:0.190401
## [94] train-logloss:0.190088
## [95] train-logloss:0.189788
## [96] train-logloss:0.189543
## [97] train-logloss:0.189259
## [98] train-logloss:0.189018
## [99] train-logloss:0.188755
## [100] train-logloss:0.188534
#Evaluate
# NOTE(review): these class predictions come from the initial (untuned)
# xgb_model, not the re-trained xgboost_model above — confirm intent.
xgb_confusion <- confusionMatrix(as.factor(xgb_class_predictions), as.factor(as.numeric(y_test)))
xgb_confusion
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 82 5
## 1 6 57
##
## Accuracy : 0.9267
## 95% CI : (0.8726, 0.9628)
## No Information Rate : 0.5867
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8491
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9318
## Specificity : 0.9194
## Pos Pred Value : 0.9425
## Neg Pred Value : 0.9048
## Prevalence : 0.5867
## Detection Rate : 0.5467
## Detection Prevalence : 0.5800
## Balanced Accuracy : 0.9256
##
## 'Positive' Class : 0
##
#the lift chart: decile gains for the initial XGBoost model's probabilities
xgb_gain <- gains(test_data$Sleep.Disorder, xgb_predictions, groups=10)
summary(xgb_gain)
## Length Class Mode
## depth 10 -none- numeric
## obs 10 -none- numeric
## cume.obs 10 -none- numeric
## mean.resp 10 -none- numeric
## cume.mean.resp 10 -none- numeric
## cume.pct.of.total 10 -none- numeric
## lift 10 -none- numeric
## cume.lift 10 -none- numeric
## mean.prediction 10 -none- numeric
## min.prediction 10 -none- numeric
## max.prediction 10 -none- numeric
## conf 1 -none- character
## optimal 1 -none- logical
## num.groups 1 -none- numeric
## percents 1 -none- logical
plot(xgb_gain)

# Truth / probability / 0.5-threshold class columns.
# (A duplicate `xgb_preds <- as.numeric(xgb_predictions > 0.5)` that was
# immediately recomputed has been removed.)
xgb_truth <- test_data$Sleep.Disorder
xgb_probs <- xgb_predictions
xgb_preds <- as.numeric(xgb_probs > 0.5)
xgb_nick <- cbind(xgb_truth, xgb_probs, xgb_preds)
# plot lift chart vs random-targeting baseline (dashed)
plot(c(0,xgb_gain$cume.pct.of.total*sum(test_data$Sleep.Disorder))~c(0,xgb_gain$cume.obs),
xlab="# cases", ylab="Cumulative", main="", type="l")
lines(c(0,sum(test_data$Sleep.Disorder))~c(0, dim(test_data)[1]), lty=2)

# plot the first two boosted trees of the tuned model
xgb.plot.tree(model=xgboost_model, trees = 0)
xgb.plot.tree(model=xgboost_model, trees = 1)
##7 SVC Model ---------------------
#Train
# NOTE(review): Sleep.Disorder is numeric here, so svm() fits in regression
# mode and returns continuous scores — hence the 0.5 threshold below.
svc_model <- svm(Sleep.Disorder ~ ., data = train_data)
#Test
svc_predictions <- predict(svc_model, newdata = x_test)
svc_class_predictions <- ifelse(svc_predictions > 0.5, 1, 0)
#Evaluate
svc_confusion <- confusionMatrix(as.factor(svc_class_predictions), as.factor(as.numeric(y_test)))
svc_confusion
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 79 6
## 1 9 56
##
## Accuracy : 0.9
## 95% CI : (0.8404, 0.9429)
## No Information Rate : 0.5867
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7953
##
## Mcnemar's Test P-Value : 0.6056
##
## Sensitivity : 0.8977
## Specificity : 0.9032
## Pos Pred Value : 0.9294
## Neg Pred Value : 0.8615
## Prevalence : 0.5867
## Detection Rate : 0.5267
## Detection Prevalence : 0.5667
## Balanced Accuracy : 0.9005
##
## 'Positive' Class : 0
##
#the lift chart: decile gains for the SVC scores
svc_gain <- gains(test_data$Sleep.Disorder, svc_predictions, groups=10)
summary(svc_gain)
## Length Class Mode
## depth 10 -none- numeric
## obs 10 -none- numeric
## cume.obs 10 -none- numeric
## mean.resp 10 -none- numeric
## cume.mean.resp 10 -none- numeric
## cume.pct.of.total 10 -none- numeric
## lift 10 -none- numeric
## cume.lift 10 -none- numeric
## mean.prediction 10 -none- numeric
## min.prediction 10 -none- numeric
## max.prediction 10 -none- numeric
## conf 1 -none- character
## optimal 1 -none- logical
## num.groups 1 -none- numeric
## percents 1 -none- logical
plot(svc_gain)

# Truth / score / 0.5-threshold class columns.
# (A duplicate `svc_preds <- as.numeric(svc_predictions > 0.5)` that was
# immediately recomputed has been removed.)
svc_truth <- test_data$Sleep.Disorder
svc_probs <- svc_predictions
svc_preds <- as.numeric(svc_probs > 0.5)
svc_nick <- cbind(svc_truth, svc_probs, svc_preds)
# plot lift chart. Bug fix: the x-axis previously used xgb_gain$cume.obs
# (copy-paste from the XGBoost section); it must use svc_gain's own deciles.
plot(c(0,svc_gain$cume.pct.of.total*sum(test_data$Sleep.Disorder))~c(0,svc_gain$cume.obs),
xlab="# cases", ylab="Cumulative", main="", type="l")
lines(c(0,sum(test_data$Sleep.Disorder))~c(0, dim(test_data)[1]), lty=2)

##8 Classification Tree ---------------------
#Train: fully grown tree (cp = 0, minsplit = 1); complexity inspected below
train.ct <- rpart(Sleep.Disorder ~ .,data = train_data,method = "class",cp = 0,minsplit = 1)
#Plot
prp(train.ct,type = 1,extra = 1,split.font = 1,varlen = -10,box.palette = "#EBDEF0",border.col = "#4A235A")

#Inspect the cp table (complexity parameter vs cross-validated error)
train.ct$cptable
## CP nsplit rel error xerror xstd
## 1 0.78494624 0 1.0000000 1.0000000 0.07929945
## 2 0.03225806 1 0.2150538 0.2150538 0.04589054
## 3 0.01612903 2 0.1827957 0.1935484 0.04374847
## 4 0.01075269 5 0.1290323 0.1827957 0.04261894
## 5 0.00000000 6 0.1182796 0.1720430 0.04144620
#Train- Predict: fit on the training set itself (optimistic estimate;
# compare with the test-set confusion matrix below)
train.ct.pred <- predict(train.ct,train_data,type = "class")
confusionMatrix(train.ct.pred, as.factor(train_data$Sleep.Disorder), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 127 7
## 1 4 86
##
## Accuracy : 0.9509
## 95% CI : (0.9138, 0.9752)
## No Information Rate : 0.5848
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8984
##
## Mcnemar's Test P-Value : 0.5465
##
## Sensitivity : 0.9247
## Specificity : 0.9695
## Pos Pred Value : 0.9556
## Neg Pred Value : 0.9478
## Prevalence : 0.4152
## Detection Rate : 0.3839
## Detection Prevalence : 0.4018
## Balanced Accuracy : 0.9471
##
## 'Positive' Class : 1
##
#Test: held-out predictions from the same fully grown tree
test.ct.pred <- predict(train.ct,test_data,type = "class")
#Evaluate (positive class = 1, i.e. has a sleep disorder)
confusionMatrix(test.ct.pred, as.factor(test_data$Sleep.Disorder), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 80 6
## 1 8 56
##
## Accuracy : 0.9067
## 95% CI : (0.8484, 0.948)
## No Information Rate : 0.5867
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8085
##
## Mcnemar's Test P-Value : 0.7893
##
## Sensitivity : 0.9032
## Specificity : 0.9091
## Pos Pred Value : 0.8750
## Neg Pred Value : 0.9302
## Prevalence : 0.4133
## Detection Rate : 0.3733
## Detection Prevalence : 0.4267
## Balanced Accuracy : 0.9062
##
## 'Positive' Class : 1
##
# Cross-validation in RPART using xval (10-fold, on the full dataset —
# note this uses sleep_data1, not just the training split)
cv.ct <- rpart(Sleep.Disorder ~ ., data = sleep_data1, method = "class",
cp = 0, minsplit = 1, xval = 10)
printcp(cv.ct)
##
## Classification tree:
## rpart(formula = Sleep.Disorder ~ ., data = sleep_data1, method = "class",
## cp = 0, minsplit = 1, xval = 10)
##
## Variables actually used in tree construction:
## [1] BMI.Category Gender Occupation Quality.of.Sleep
## [5] Stress.Level
##
## Root node error: 155/374 = 0.41444
##
## n= 374
##
## CP nsplit rel error xerror xstd
## 1 0.7741935 0 1.00000 1.00000 0.061464
## 2 0.0258065 1 0.22581 0.22581 0.036338
## 3 0.0150538 2 0.20000 0.20000 0.034400
## 4 0.0064516 5 0.15484 0.17419 0.032291
## 5 0.0032258 7 0.14194 0.15484 0.030575
## 6 0.0000000 9 0.13548 0.14839 0.029974
##9 Random Forest ------------------------
#Train: 100 trees first, to see where the error flattens
train.rf <- randomForest(as.factor(Sleep.Disorder) ~ .,
data = train_data,
ntree = 100,
importance = TRUE)
# plot classification error (err.rate column 1, the OOB rate) vs tree count
plot(train.rf$err.rate[,1], type = "l", xlab = "Number of Trees", ylab = "Classification Error")

#Train- refit with ntree = 60, chosen from the error curve above
train.rf <- randomForest(as.factor(Sleep.Disorder) ~ .,
data = train_data,
ntree = 60,
importance = TRUE)
#Variable Importance Plot (type = 1: permutation-based mean decrease in accuracy)
varImpPlot(train.rf, type = 1, main="Variable Importance")

#Test: hard class predictions and class probabilities
rf.pred <- predict(train.rf, test_data)
rf.pred1 <- predict(train.rf, test_data, type = "prob")
#Evaluate (positive class = 1)
confusionMatrix(rf.pred, as.factor(test_data$Sleep.Disorder), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 80 5
## 1 8 57
##
## Accuracy : 0.9133
## 95% CI : (0.8564, 0.953)
## No Information Rate : 0.5867
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8226
##
## Mcnemar's Test P-Value : 0.5791
##
## Sensitivity : 0.9194
## Specificity : 0.9091
## Pos Pred Value : 0.8769
## Neg Pred Value : 0.9412
## Prevalence : 0.4133
## Detection Rate : 0.3800
## Detection Prevalence : 0.4333
## Balanced Accuracy : 0.9142
##
## 'Positive' Class : 1
##
#the lift chart
# Probability of the positive class (second column of the prob matrix).
positive_prob <- rf.pred1[,2]
actual_numeric <- as.numeric(as.factor(test_data$Sleep.Disorder)) - 1 # convert factor levels to 0 and 1
# NOTE(review): gains() returns only 4 groups here (see summary below) —
# presumably ties in the tree-vote probabilities collapse the deciles.
rf_gain <- gains(actual_numeric, positive_prob, groups=10)
summary(rf_gain)
## Length Class Mode
## depth 4 -none- numeric
## obs 4 -none- numeric
## cume.obs 4 -none- numeric
## mean.resp 4 -none- numeric
## cume.mean.resp 4 -none- numeric
## cume.pct.of.total 4 -none- numeric
## lift 4 -none- numeric
## cume.lift 4 -none- numeric
## mean.prediction 4 -none- numeric
## min.prediction 4 -none- numeric
## max.prediction 4 -none- numeric
## conf 1 -none- character
## optimal 1 -none- logical
## num.groups 1 -none- numeric
## percents 1 -none- logical
# Cumulative positives captured vs cases targeted; dashed = random baseline.
plot(c(0,rf_gain$cume.pct.of.total*sum(test_data$Sleep.Disorder)) ~ c(0,rf_gain$cume.obs),
xlab="# Cases", ylab="Cumulative", main="Lift Chart for Random Forest", type="l")
lines(c(0,sum(test_data$Sleep.Disorder)) ~ c(0, nrow(test_data)), lty=2)

##10 KNN ----------------------
#knn model (k = 2)
# Bug fix: use the feature matrices x_train/x_test, NOT the full data frames.
# train_data/test_data still contain the Sleep.Disorder target column, so
# passing them to knn() leaked the true label into the features and inflated
# the reported accuracy. (Cached output removed; results change after the fix.)
knn.pred <- knn(train = x_train, test = x_test, cl = y_train, k = 2)
knn_cm <- confusionMatrix(data = knn.pred, reference = factor(y_test, levels = c("0", "1")))
print(knn_cm$table)
print(knn_cm)
#find best k: test-set accuracy for k = 1..20
accuracy.df <- data.frame(k = seq(1, 20, 1), accuracy = rep(0, 20))
for(i in 1:20) {
# Feature matrices only — keeps the Sleep.Disorder label out of the inputs
# (the original passed train_data/test_data, which still contain the target).
knn.pred <- knn(train = x_train, test = x_test, cl = y_train, k = i)
# factor() each side once; the original nested factor(factor(...)) was redundant.
accuracy.df[i, 2] <- confusionMatrix(factor(knn.pred, levels = c(0,1)),
factor(y_test, levels = c(0,1)))$overall[1]
}
plot(accuracy.df[,1], accuracy.df[,2], type = "l",
xlab = "k",
ylab = "Accuracy")

#best k = 3 (read off the accuracy curve above)
#knn model- use best k, on the leakage-free feature matrices
# (Cached output removed: accuracies change once the label column is no
# longer fed to knn() as a feature.)
knn.pred <- knn(train = x_train, test = x_test, cl = y_train, k = 3)
knn_cm <- confusionMatrix(data = knn.pred, reference = factor(y_test, levels = c("0", "1")))
print(knn_cm$table)
print(knn_cm)
# Crude response chart: knn() yields hard classes only (no probabilities),
# so cases are ordered by predicted class, split into 10 equal groups, and
# the actual positives are cumulated per group.
results_df <- data.frame(actual = as.numeric(as.factor(y_test)) - 1,
predicted = as.numeric(as.factor(knn.pred)) - 1)
results_df <- results_df[order(results_df$predicted, decreasing = TRUE),]
results_df$group <- cut(seq(1, nrow(results_df)), breaks = 10, labels = FALSE)
cumulative_response <- tapply(results_df$actual, results_df$group, function(x) sum(x))
cumulative_response_sum <- cumsum(cumulative_response)
plot(cumulative_response_sum, type = "o", xlab = "Group", ylab = "Cumulative Positive Responses",
main = "kNN Model Response Chart")

##11 Naive Bayes ----------------------
#Train on the feature matrix (target passed separately as y)
nb <- naiveBayes(x_train, y_train)
#Test
nb_predictions <- predict(nb, x_test)
#Evaluate: rows = predicted class, columns = actual class
nb_confusionMatrix <- table(Predicted = nb_predictions, Actual = y_test)
print(nb_confusionMatrix)
## Actual
## Predicted 0 1
## 0 79 6
## 1 9 56
summary(nb_predictions)
## 0 1
## 85 65
# calculate sensitivity/specificity and accuracy from a 2x2 table whose rows
# are predictions and columns are actuals, with class "0" first (the layout
# produced by table(Predicted, Actual) above). "Sensitivity" is therefore
# computed with respect to the first class, matching caret's output earlier.
calculate_metrics <- function(confusionMatrix) {
  first_class_total <- confusionMatrix[1, 1] + confusionMatrix[2, 1]
  second_class_total <- confusionMatrix[1, 2] + confusionMatrix[2, 2]
  correct_total <- sum(diag(confusionMatrix))
  list(
    sensitivity = confusionMatrix[1, 1] / first_class_total,
    specificity = confusionMatrix[2, 2] / second_class_total,
    accuracy = correct_total / sum(confusionMatrix)
  )
}
# Metrics for the Naive Bayes table above (computed w.r.t. class 0 in the
# first row/column, consistent with caret's default positive class earlier).
nb_metrics <- calculate_metrics(nb_confusionMatrix)
print(nb_metrics)
## $sensitivity
## [1] 0.8977273
##
## $specificity
## [1] 0.9032258
##
## $accuracy
## [1] 0.9
##12 Linear Discriminant Analysis ----------------------
#Train
# Store the fit as lda_model so MASS::lda() itself is not shadowed by the
# fitted object (the original `lda <- lda(...)` masked the function).
lda_model <- lda(x_train, grouping = y_train)
#Test: predicted classes on the held-out features
lda_predictions <- predict(lda_model, x_test)$class
#Evaluate: rows = predicted class, columns = actual class
lda_confusionMatrix <- table(Predicted = lda_predictions, Actual = y_test)
print(lda_confusionMatrix)
## Actual
## Predicted 0 1
## 0 79 5
## 1 9 57
lda_metrics <- calculate_metrics(lda_confusionMatrix)
print(lda_metrics)
## $sensitivity
## [1] 0.8977273
##
## $specificity
## [1] 0.9193548
##
## $accuracy
## [1] 0.9066667